#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
library(biomaRt)
suppressWarnings(suppressMessages(library(WGCNA)))
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)
# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Clusterings
clusterings = read_csv('./../Data/Gandal/clusters.csv')
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=genes_info$ID, mart=mart)
genes_info = genes_info %>% left_join(gene_names, by=c('ID'='ensembl_gene_id'))
clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
rm(DE_info, GO_annotations, clusterings)
Using the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.
datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)
# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)
# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)
# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated'))
}
rm(ME_object)
I’m going to select all the modules that have an absolute correlation higher than 0.9 with Diagnosis to study them
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]
# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)
labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = gsub('ME','',rownames(moduleTraitCor)),
yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
main = paste('Module-Trait relationships'))
diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
'MTcor' = moduleTraitCor[,'Diagnosis'],
'MTpval' = moduleTraitPvalue[,'Diagnosis'])
genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')
rm(moduleTraitPvalue, datTraits, textMatrix, diagnosis_cor)
I was expecting the modules to consist mainly of genes with the highest LFC (largest absolute values in PC2), and they are large, but many of the largest genes are not in these modules.
top_modules = gsub('ME','',rownames(moduleTraitCor)[abs(moduleTraitCor[,'Diagnosis'])>0.9])
cat(paste0('Top modules selected: ', paste(top_modules, collapse=', '),'\n'))
## Top modules selected: #9B8EFF, #F066EA, #8DAB00
pca = datExpr %>% prcomp
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
alpha = ifelse(ImportantModules=='Others', 0.2, 0.4))
table(plot_data$ImportantModules)
##
## #8DAB00 #9B8EFF #F066EA Others
## 1277 215 1158 13950
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) +
geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=ID)) + theme_minimal() +
ggtitle('Modules with strongest relation to Diagnosis'))
rm(pca)
create_plot = function(module){
plot_data = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module)
colnames(plot_data)[2] = 'Module'
SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='None'])))
p = ggplotly(plot_data %>% ggplot(aes(Module, GS, color=gene.score)) + geom_point(alpha=0.5, aes(ID=ID)) + ylab('Gene Significance') +
scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,8))) + theme_minimal() + xlab('Module Membership') +
ggtitle(paste0('Module ', module,' (MTcor = ', round(moduleTraitCor[paste0('ME',module),1],2),')')))
return(p)
}
create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)
Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but this doesn’t seem to be the case (even the opposite may be true)
plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>%
group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>%
left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>%
mutate(p=round(p/n*100,2))
for(i in 1:nrow(plot_data)){
this_row = plot_data[i,]
if(this_row$hasSFARIscore==FALSE & this_row$p==100){
new_row = this_row
new_row$hasSFARIscore = TRUE
new_row$p = 0
plot_data = plot_data %>% rbind(new_row)
}
}
plot_data = plot_data %>% filter(hasSFARIscore==TRUE)
ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) + geom_hline(yintercept=mean(plot_data$p), color='gray') +
xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row, plot_data)
Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control.
In all three cases, the Eigengenes separate the behaviour between autism and control samples very clearly!
plot_EGs = function(module){
plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME',module)], 'Diagnosis' = datMeta$Diagnosis_)
p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + geom_boxplot() + theme_minimal() + theme(legend.position='none') +
ggtitle(paste0('Module ', module, ' (MTcor=',round(moduleTraitCor[paste0('ME',module),1],2),')'))
return(p)
}
p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])
grid.arrange(p1, p2, p3, nrow=1)
rm(plot_EGs, p1, p2, p3)
Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance
*Ordered by \(\frac{MM+|GS|}{2}\)
Only two SFARI Genes in the three lists, none from scores 1 and 2…
create_table = function(module){
top_genes = dataset %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>%
dplyr::select(external_gene_id, paste0('MM.',gsub('#','',module)), GS, gene.score) %>%
filter(dataset$Module==module) %>% rename('MM' = paste0('MM.',gsub('#','',module))) %>%
mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>% top_n(10)
return(top_genes)
}
top_genes_1 = create_table(top_modules[1])
kable(top_genes_1, caption=paste0('Top 10 genes for module ', top_modules[1], ' (MTcor = ',
round(moduleTraitCor[paste0('ME',top_modules[1]),1],2),')'))
| external_gene_id | MM | GS | gene.score | importance |
|---|---|---|---|---|
| MCL1 | 0.8768314 | 0.9060349 | None | 0.8914332 |
| ITGA5 | 0.8613780 | 0.8646717 | None | 0.8630249 |
| TOB2 | 0.7916638 | 0.8719542 | None | 0.8318090 |
| SRGAP1 | 0.7680729 | 0.8951553 | None | 0.8316141 |
| FOXO1 | 0.7883168 | 0.8458336 | None | 0.8170752 |
| ITPRIP | 0.8218756 | 0.8042687 | None | 0.8130721 |
| LATS2 | 0.7719291 | 0.8433350 | None | 0.8076321 |
| ZNF516 | 0.7243000 | 0.8764275 | None | 0.8003638 |
| PARD3 | 0.6952386 | 0.8949607 | None | 0.7950996 |
| PLEKHG1 | 0.7711860 | 0.8073816 | None | 0.7892838 |
top_genes_2 = create_table(top_modules[2])
kable(top_genes_2, caption=paste0('Top 10 genes for module ', top_modules[2], ' (MTcor = ',
round(moduleTraitCor[paste0('ME',top_modules[2]),1],2),')'))
| external_gene_id | MM | GS | gene.score | importance |
|---|---|---|---|---|
| HIST1H2AC | 0.8413858 | 0.8648084 | None | 0.8530971 |
| ZBTB20 | 0.8640272 | 0.8406765 | 3 | 0.8523519 |
| CFLAR | 0.8807946 | 0.8010251 | None | 0.8409098 |
| HIST2H2BE | 0.7903735 | 0.8878029 | None | 0.8390882 |
| ELF1 | 0.7921926 | 0.7836121 | None | 0.7879024 |
| KANK1 | 0.8162030 | 0.7554489 | 4 | 0.7858260 |
| REST | 0.8091856 | 0.7472778 | None | 0.7782317 |
| SFMBT2 | 0.8039142 | 0.7457312 | None | 0.7748227 |
| ZNF334 | 0.7958118 | 0.7482581 | None | 0.7720349 |
| HEBP2 | 0.7890904 | 0.7462823 | None | 0.7676863 |
top_genes_3 = create_table(top_modules[3])
kable(top_genes_3, caption=paste0('Top 10 genes for module ', top_modules[3],' (MTcor = ',
round(moduleTraitCor[paste0('ME',top_modules[3]),1],2),')'))
| external_gene_id | MM | GS | gene.score | importance |
|---|---|---|---|---|
| MAPK9 | 0.9014380 | -0.9399810 | None | 0.9207095 |
| TRIM37 | 0.9067664 | -0.9250494 | None | 0.9159079 |
| PREPL | 0.8923376 | -0.8966280 | None | 0.8944828 |
| NAP1L5 | 0.8590945 | -0.8915541 | None | 0.8753243 |
| EIF5A2 | 0.8377986 | -0.9116043 | None | 0.8747015 |
| DIRAS1 | 0.8551447 | -0.8936753 | None | 0.8744100 |
| ATP6V1C1 | 0.8548700 | -0.8885175 | None | 0.8716938 |
| TTBK2 | 0.8740293 | -0.8654230 | None | 0.8697261 |
| PRKCE | 0.8345108 | -0.9028094 | None | 0.8686601 |
| ATP6V1A | 0.8277575 | -0.9068579 | None | 0.8673077 |
rm(create_table)
pca = datExpr %>% prcomp
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
mutate(color = ifelse(Module %in% top_modules, as.character(Module), 'gray')) %>%
mutate(alpha = ifelse(color %in% top_modules &
ID %in% c(as.character(top_genes_1$ID),
as.character(top_genes_2$ID),
as.character(top_genes_3$ID)), 1, 0.1))
plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) +
theme_minimal() + ggtitle('Important genes identified through WGCNA')
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] WGCNA_1.68 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [4] biomaRt_2.42.0 knitr_1.24 doParallel_1.0.15
## [7] iterators_1.0.12 foreach_1.4.7 polycor_0.7-10
## [10] expss_0.10.1 GGally_1.4.0 gridExtra_2.3
## [13] viridis_0.5.1 viridisLite_0.3.0 RColorBrewer_1.1-2
## [16] dendextend_1.13.2 plotly_4.9.1 glue_1.3.1
## [19] reshape2_1.4.3 forcats_0.4.0 stringr_1.4.0
## [22] dplyr_0.8.3 purrr_0.3.3 readr_1.3.1
## [25] tidyr_1.0.0 tibble_2.1.3 ggplot2_3.2.1
## [28] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5
## [3] Hmisc_4.2-0 BiocFileCache_1.10.2
## [5] plyr_1.8.5 lazyeval_0.2.2
## [7] splines_3.6.0 crosstalk_1.0.0
## [9] BiocParallel_1.20.1 GenomeInfoDb_1.22.0
## [11] robust_0.4-18.2 digest_0.6.23
## [13] htmltools_0.4.0 GO.db_3.10.0
## [15] fansi_0.4.1 magrittr_1.5
## [17] checkmate_1.9.4 memoise_1.1.0
## [19] fit.models_0.5-14 cluster_2.0.8
## [21] annotate_1.64.0 modelr_0.1.5
## [23] matrixStats_0.55.0 askpass_1.1
## [25] prettyunits_1.0.2 colorspace_1.4-1
## [27] blob_1.2.0 rvest_0.3.5
## [29] rappdirs_0.3.1 rrcov_1.4-7
## [31] haven_2.2.0 xfun_0.8
## [33] crayon_1.3.4 RCurl_1.95-4.12
## [35] jsonlite_1.6 genefilter_1.68.0
## [37] zeallot_0.1.0 impute_1.60.0
## [39] survival_2.44-1.1 gtable_0.3.0
## [41] zlibbioc_1.32.0 XVector_0.26.0
## [43] DelayedArray_0.12.2 BiocGenerics_0.32.0
## [45] DEoptimR_1.0-8 scales_1.1.0
## [47] mvtnorm_1.0-11 DBI_1.1.0
## [49] Rcpp_1.0.3 xtable_1.8-4
## [51] progress_1.2.2 htmlTable_1.13.1
## [53] foreign_0.8-71 bit_1.1-15.1
## [55] preprocessCore_1.48.0 Formula_1.2-3
## [57] stats4_3.6.0 htmlwidgets_1.5.1
## [59] httr_1.4.1 ellipsis_0.3.0
## [61] acepack_1.4.1 farver_2.0.3
## [63] pkgconfig_2.0.3 reshape_0.8.8
## [65] XML_3.98-1.20 nnet_7.3-12
## [67] dbplyr_1.4.2 locfit_1.5-9.1
## [69] later_1.0.0 labeling_0.3
## [71] tidyselect_0.2.5 rlang_0.4.2
## [73] AnnotationDbi_1.48.0 munsell_0.5.0
## [75] cellranger_1.1.0 tools_3.6.0
## [77] cli_2.0.1 generics_0.0.2
## [79] RSQLite_2.2.0 broom_0.5.3
## [81] fastmap_1.0.1 evaluate_0.14
## [83] yaml_2.2.0 bit64_0.9-7
## [85] fs_1.3.1 robustbase_0.93-5
## [87] nlme_3.1-139 mime_0.8
## [89] xml2_1.2.2 compiler_3.6.0
## [91] rstudioapi_0.10 curl_4.3
## [93] reprex_0.3.0 geneplotter_1.64.0
## [95] pcaPP_1.9-73 stringi_1.4.5
## [97] highr_0.8 lattice_0.20-38
## [99] Matrix_1.2-17 vctrs_0.2.1
## [101] pillar_1.4.3 lifecycle_0.1.0
## [103] data.table_1.12.8 bitops_1.0-6
## [105] httpuv_1.5.2 GenomicRanges_1.38.0
## [107] R6_2.4.1 latticeExtra_0.6-28
## [109] promises_1.1.0 IRanges_2.20.2
## [111] codetools_0.2-16 MASS_7.3-51.4
## [113] assertthat_0.2.1 SummarizedExperiment_1.16.1
## [115] openssl_1.4.1 DESeq2_1.26.0
## [117] withr_2.1.2 S4Vectors_0.24.2
## [119] GenomeInfoDbData_1.2.2 hms_0.5.3
## [121] grid_3.6.0 rpart_4.1-15
## [123] rmarkdown_1.14 Cairo_1.5-10
## [125] shiny_1.4.0 Biobase_2.46.0
## [127] lubridate_1.7.4 base64enc_0.1-3